#' 
#' reading in:
#'   MPDataset_MPDS2023a.csv  (original data from Manifesto Project) 
#' cleaning
#'   adding/renaming country_isocode and other identifying variables
#'   dropping observations that aren't used in the analysis
#' writing out: 
#'   manifesto_clean.csv  (in process_data folder)
#'
#'

# Read the raw Manifesto Project data.
# guess_max = Inf makes readr look at every row when guessing column types.
# By default only the first 1000 rows are inspected, so a column that is
# entirely missing early in the file is typed as logical and any later
# numeric values fail to parse and become NA.
# NOTE(review): sparsely populated columns such as presvote look exposed to
# this -- confirm against readr's parsing problems() for the default read.
m <- read_csv("source_data/MPDataset_MPDS2023a.csv", guess_max = Inf)


# adding country_isocode variable to m
#
# countrycode() has no iso3c code for two country names in this data:
#   "Northern Ireland" and "German Democratic Republic".
# The GDR is given the code 'DDR' through custom_match, which overrides the
# default lookup for exactly that name, so no second pass over the table is
# needed. Northern Ireland is deliberately left as NA.
m <- m |> mutate(
  country_isocode = countrycode(
    countryname,
    origin = "country.name",
    destination = "iso3c",
    custom_match = c("German Democratic Republic" = "DDR")
  )
)

# check: country names that still have no code
m |> filter(is.na(country_isocode)) |> pull(countryname) |> unique()
# expected: [1] "Northern Ireland"

# Remove rows that are out of scope for the analysis, in a single filter:
#  - rows without a country_isocode (Northern Ireland, the only country
#    name left without a code once the GDR has been assigned 'DDR')
#  - Costa Rica and Panama
#    (no militaries during this period, so no ministers of defense)
m <- m |> filter(
  !is.na(country_isocode),
  !country_isocode %in% c("CRI", "PAN")
)

# Build the identifier variables:
#   manifesto_partyid  the party column, renamed
#   elec_date          edate parsed as a Date (day/month/year text)
#   year               calendar year of elec_date
#   country_elec       "<isocode> / <elec_date>"
#   party_iso_date     "<manifesto_partyid> / <isocode> / <elec_date>"
m <- rename(m, manifesto_partyid = party)
m <- m |> mutate(
  elec_date = as.Date(edate, format = "%d/%m/%Y"),
  year = year(elec_date),
  country_elec = paste(country_isocode, elec_date, sep = " / "),
  # country_elec already holds "<isocode> / <elec_date>", so prefixing the
  # party id yields the same three-part key
  party_iso_date = paste(manifesto_partyid, country_elec, sep = " / ")
)

# sanity checks on the new keys
n_distinct(m$country_elec) # 822
n_distinct(m$party_iso_date) # 5000
m |> filter(duplicated(party_iso_date)) # none


# Parties for which neither pervote nor presvote is ever recorded (71 parties)
parties_with_no_votes_ever <- m |>
  summarise(
    # TRUE as soon as one row of the party has either vote variable
    has_any_vote = any(!is.na(pervote) | !is.na(presvote)),
    .by = manifesto_partyid
  ) |>
  filter(!has_any_vote) |>
  pull(manifesto_partyid)

# check: pervote is missing on every row of these parties
table(is.na(m$pervote[m$manifesto_partyid %in% parties_with_no_votes_ever]))
# all T, 91

# remove those parties from the data
m <- m |> filter(!(manifesto_partyid %in% parties_with_no_votes_ever))



#### WRITING OUT ####

# save the cleaned data
write_csv(m, "process_data/manifesto_clean.csv")

# clear the workspace, keeping only replication_wd
rm(list = setdiff(ls(), "replication_wd"))
